## Functions to compute, plot and test entropy in description choices
## (framing, manner expression, object naming) by language, grouped by
## speaker or by event


#  ------------------------------------------------------------------------
#  Entropy functions
#  ------------------------------------------------------------------------

# Empirical (Shannon) entropy, in bits, of a categorical vector v.
# see https://en.wikipedia.org/wiki/Entropy_%28information_theory%29
#
# v is coerced to character before counting, so unused factor levels never
# contribute a zero-count category (which would give 0 * log2(0) = NaN).
# NA values are left out of the counts by table()'s default behaviour.
# Returns a single number: 0 when only one category occurs, and log2(k) when
# k categories occur equally often.
compute_H <- function(v) {
  freq <- as.vector(table(as.character(v)))
  rel_freq <- freq / sum(freq)
  -sum(rel_freq * log2(rel_freq))
}


# Entropy of column `DV`, computed separately for every combination of
# `language` and the column named by `grouping`.
#
# df       data frame with a `language` column plus the columns named by
#          `grouping` and `DV`
# grouping column name (string) of the unit to group by, e.g. "speaker"
# DV       column name (string) of the categorical variable
#
# Returns the summarised table with the grouping columns and the entropy in
# column `H`.
compute_entropy <- function(df, grouping, DV) {
  # the column names arrive as strings: turn them into symbols and unquote
  # them with the bang-bang operator (!!) for tidy evaluation
  df_grouped <- group_by(df, language, !!sym(grouping))
  summarise(df_grouped, H = compute_H(!!sym(DV)))
}


#  ------------------------------------------------------------------------
#  Plotting functions
#  ------------------------------------------------------------------------

# Plot entropy values by language: one jittered point per level of
# `grouping`, overlaid with a bootstrapped confidence interval of the mean.
#
# df       data frame passed on to compute_entropy()
# grouping column name (string) of the unit to group by; also used in the
#          plot title
# DV       one of "framing", "manner_expressed" or "obj_head"; any other
#          value stops with an error
#
# Relies on jitter_width, eb_size, eb_width and mytheme being defined
# outside this function. Returns the ggplot object.
plot_entropy <- function(df, grouping, DV) {

  # one entropy value per language x grouping level: the plotted points
  entropy_tbl <- compute_entropy(df = df, grouping = grouping, DV = DV)

  # readable name of the dependent variable for the y-axis label
  dv_label <- switch(
    DV,
    framing = "framing choices",
    manner_expressed = "manner expression",
    obj_head = "object name",
    stop("Invalid DV!")
  )
  y_label <- paste("Entropy (H)\nover", dv_label)

  # overrides applied on top of mytheme: no legend, centred title, bare
  # panel with black axis lines only
  bare_panel <- theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    text = element_text(size = 9),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_blank(),
    axis.line.x = element_line(color = "black"),
    axis.line.y = element_line(color = "black")
  )

  ggplot(
    entropy_tbl,
    aes(x = language, y = H, colour = language, shape = language)
  ) +
    # jitter horizontally only, so the entropy values stay exact
    geom_jitter(height = 0, width = jitter_width) +
    # bootstrapped confidence interval of the mean, drawn in black
    stat_summary(
      fun.data = mean_cl_boot,
      geom = "errorbar",
      size = eb_size,
      width = eb_width,
      colour = "black"
    ) +
    xlab("") +
    ylab(y_label) +
    ggtitle(paste(grouping, "entropy")) +
    mytheme +
    scale_colour_brewer(palette = "Accent") +
    bare_panel
}


# Draw the by-speaker and by-event entropy plots side by side with a common
# y-axis range, and return the combined grob.
#
# df  data frame passed on to compute_entropy() and plot_entropy()
# DV  column name (string) of the dependent variable; see plot_entropy()
multiplot_H <- function(df, DV) {
  # The shared y-limits have to be derived from the data that is actually
  # plotted, i.e. the `df` argument, not from an object in the calling
  # environment.
  H_speak <- compute_entropy(df, "speaker", DV)
  H_event <- compute_entropy(df, "event", DV)
  myrange <- range(H_speak$H, H_event$H)
  # leave 10% headroom above the largest entropy value
  ylimits <- ylim(myrange[1], 1.1 * myrange[2])
  mplot <- arrangeGrob(
    plot_entropy(df, "speaker", DV) + ylimits,
    plot_entropy(df, "event", DV) + ylimits,
    ncol = 2
  )
  grid.draw(mplot)
  mplot
}


#  ------------------------------------------------------------------------
#  Non-parametric (Mann-Whitney-Wilcoxon) tests comparing entropy by language
#  ------------------------------------------------------------------------

# Compare entropy between the two languages with Wilcoxon tests, once with
# speakers and once with events as the unit of analysis.
#
# df  data frame passed on to compute_entropy()
# DV  column name (string) of the dependent variable
#
# Returns a list with elements "speaker" and "event"; each is a list holding
# `medians` (median entropy per language) and `mann_whitney` (the htest
# object returned by wilcox.test()).
test_mannwhit <- function(df, DV) {

  # Paired Wilcoxon test on entropies, with the two languages matched
  # explicitly on the column named by `var` instead of relying on row order.
  # Uses the default (x, y) method because the formula method of
  # wilcox.test() rejects the `paired` argument in recent R versions.
  paired_wilcox <- function(H_df, var) {
    H_plain <- as.data.frame(H_df)  # plain data frame for base indexing
    lang <- factor(H_plain$language)
    if (nlevels(lang) != 2L) {
      stop("grouping factor must have exactly 2 levels")
    }
    # same x/y assignment as the formula method: first level is x
    first <- H_plain[which(lang == levels(lang)[1]), , drop = FALSE]
    second <- H_plain[which(lang == levels(lang)[2]), , drop = FALSE]
    partner <- match(first[[var]], second[[var]])
    has_partner <- !is.na(partner)
    if (!all(has_partner) || sum(has_partner) < nrow(second)) {
      warning(
        "Some '", var, "' values occur in only one language and are ",
        "dropped from the paired test.",
        call. = FALSE
      )
    }
    res <- wilcox.test(
      first$H[has_partner],
      second$H[partner[has_partner]],
      paired = TRUE
    )
    # keep the label the formula interface would have reported
    res$data.name <- "H by language"
    res
  }

  median_test <- function(df, DV, var) {
    H_df <- compute_entropy(df, var, DV)
    my_median <- H_df %>%
      group_by(language) %>%
      summarise(Median = median(H))

    # The test is paired for events (every event is described in both
    # languages) but not for speakers
    if (var == "speaker") {
      # unpaired is the default; `paired` must not be passed to the
      # formula method
      wilcox <- wilcox.test(H ~ language, data = H_df)
    } else if (var == "event") {
      wilcox <- paired_wilcox(H_df, var)
    } else {  # safety measure
      stop("Invalid value for 'var' parameter!")
    }

    list(medians = my_median, mann_whitney = wilcox)
  }

  units <- c("speaker", "event")
  out <- lapply(units, function(unit) median_test(df, DV, unit))
  names(out) <- units
  out
}
